Load and format stuffs

Packages

library(ggplot2)
library(reshape2)
library(cowplot)
library(vegan)
library(metabaR)

Data

# Import obitab
#--------------
tmp <-
  read.table(
    "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez2018_s4paircontigs_s5tagmatch_s6highQ_2libs_s7uniq_s8nosingle_s9clust.tab",
    h = TRUE,
    sep = "\t"
  )

# create reads table
#--------------
reads <- t(tmp[,grep("^sample\\.", colnames(tmp))])
colnames(reads) <- tmp$id
rownames(reads) <- gsub("^sample\\.", "", rownames(reads))

# create motus table
#--------------
motus <- tmp[,grep("^sample\\.", colnames(tmp), invert = TRUE)[-1]]
rownames(motus) <- tmp$id

# Import sample information
#--------------
tmp1 <-
  read.table(
    "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/Lez2015_2018_IDseq_editLDG.txt",
    h = TRUE,
    sep = "\t", 
    quote="", fill=FALSE
  )

# create pcrs table
#--------------
pcrs <- data.frame(plate_no = tmp1$plate_number,
                   plate_col = tmp1$COLUMN,
                   plate_row = tmp1$ROW,
                   tag_fwd = sapply(strsplit(tmp1$tags, ":"), "[[", 1),
                   tag_rev = sapply(strsplit(tmp1$tags, ":"), "[[", 2),
                   primer_fwd = "GGATTAGATACCCTGGTAGT",
                   primer_rev = "CACGACACGAGCTGACG",
                   sample_id = tmp1$ID.seq,
                   id = tmp1$ID,
                   row.names = tmp1$ID.seq)

# specify pcr/control types in pcrs table
pcrs$type <- "sample"
pcrs$control_type <- NA

#field controls
pcrs$type[grep("ouvert", tmp1$ID)] <- "control" 
pcrs$control_type[grep("ouvert", tmp1$ID)] <- "positive" # trick to include these particular controls since values for metabaR controls are fixed (extraction/pcr/sequencing/positive/NA). Added issue in metabaR GitHub for more flex for future version of the package.

#extraction controls
# only one "controle_extraction"... I will consider the field controls "ferme" as extraction controls too. NAs seem to be extraction controls too.
pcrs$type[grep("extraction|ferme", tmp1$ID)] <- "control" 
pcrs$control_type[grep("extraction|ferme", tmp1$ID)] <- "extraction" 
pcrs$type[is.na(tmp1$ID)] <- "control" 
pcrs$control_type[is.na(tmp1$ID)] <- "extraction" 


#pcrs controls (but may be extraction controls...)
pcrs$type[grep("BLANK", tmp1$ID)] <- "control" 
pcrs$control_type[grep("BLANK", tmp1$ID)] <- "pcr" 

#sequencing controls
# NUs 
pcrs$type[grep("NU", tmp1$ID)] <- "control" 
pcrs$control_type[grep("NU", tmp1$ID)] <- "sequencing" 

#further info relevant to exp problems
pcrs$year <- tmp1$Annee
pcrs$boite_genet <- tmp1$Boite_Genet
pcrs$year <- tmp1$Annee
pcrs$control_closed <- tmp1$C_ferme
pcrs$control_open <- tmp1$C_ouvert
pcrs$clo_quality <- tmp1$Qualite_Clo

# create sample table
#--------------
# todo amend later, include only some infos from tmp1
id <- which(pcrs$type=="sample")
samples <- data.frame(id = tmp1$ID[id],
                      elev = tmp1$Elev[id], 
                      age = tmp1$Age[id],
                      sex = tmp1$Sexe[id], 
                      pop = tmp1$Pop[id],
                      notes = tmp1$Remarques[id], 
                      year = tmp1$Annee[id],
                      sampling_date = tmp1$Date.Prelevement[id],
                      nprel = tmp1$Nprel[id],
                      qual_clo = tmp1$Qualite_Clo[id],
                      feces = tmp1$Feces[id],
                      divprel = tmp1$Divers.Prelevements[id],
                      row.names = tmp1$ID.seq[id])


# create the metabarlist object
#--------------

mblist <- metabarlist_generator(reads = reads, 
                                motus = motus,
                                pcrs = pcrs,
                                samples = samples)
## Warning in check_metabarlist(out): Some PCRs in out have a number of reads of
## zero in table `reads`!
# add taxonomic annotation
mblist <-
  silva_annotator(metabarlist = mblist,
                  silva.path = "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez_2015_2018---ssu---otus.csv",
                  clust.path = "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez_2015_2018---ssu---sequence_cluster_map---lez2018_s4paircontigs_s5tagmatch_s6highQ_2libs_s7uniq_s8nosingle_s9clust_s10keepone.clstr")
## Warning in check_metabarlist(metabarlist): Some PCRs in metabarlist have a
## number of reads of zero in table `reads`!

Data overview

Data available and summary statistics

Overview per pcr type

colnames(mblist$motus) 
##  [1] "GC_content"         "ID"                 "ali_length"        
##  [4] "class"              "cluster"            "cluster_center"    
##  [7] "cluster_score"      "cluster_weight"     "count"             
## [10] "direction"          "distance"           "experiment"        
## [13] "forward_match"      "forward_primer"     "forward_score"     
## [16] "forward_tag"        "mode"               "primers"           
## [19] "reverse_match"      "reverse_primer"     "reverse_score"     
## [22] "reverse_tag"        "sampling_date"      "seq_a_deletion"    
## [25] "seq_a_insertion"    "seq_a_mismatch"     "seq_a_single"      
## [28] "seq_ab_match"       "seq_b_deletion"     "seq_b_insertion"   
## [31] "seq_b_mismatch"     "seq_b_single"       "seq_length"        
## [34] "seq_length_ori"     "status"             "sequence"          
## [37] "similarity"         "superkingdom_silva" "kingdom_silva"     
## [40] "phylum_silva"       "class_silva"        "order_silva"       
## [43] "family_silva"       "genus_silva"        "lineage_silva"
colnames(mblist$pcrs) 
##  [1] "plate_no"       "plate_col"      "plate_row"      "tag_fwd"       
##  [5] "tag_rev"        "primer_fwd"     "primer_rev"     "sample_id"     
##  [9] "id"             "type"           "control_type"   "year"          
## [13] "control_closed" "control_open"   "clo_quality"
colnames(mblist$samples) 
##  [1] "id"            "elev"          "age"           "sex"          
##  [5] "pop"           "notes"         "year"          "sampling_date"
##  [9] "nprel"         "qual_clo"      "feces"         "divprel"
summary_metabarlist(mblist)
## $dataset_dimension
##         n_row n_col
## reads    1440  8070
## motus    8070    45
## pcrs     1440    15
## samples   986    12
## 
## $dataset_statistics
##         nb_reads nb_motus avg_reads sd_reads avg_motus sd_motus
## pcrs    11183625     8070  7766.406 13850.99  126.3639 165.1573
## samples 11124692     8047 11282.649 15520.82  175.5984 178.1532

Distribution of # reads and motus in each pcr, expecting no (or few) MOTUs/reads in sequencing, PCR and extraction controls. NA labels in the plots shown downstream will correspond to samples and positive controls to field open controls.

# store # reads and motus / pcr
mblist$pcrs$nb_reads <- rowSums(mblist$reads)
mblist$pcrs$nb_motus <- rowSums(mblist$reads>0)

# plot 
check1 <- melt(mblist$pcrs[,c("control_type", "nb_reads", "nb_motus")])
ggplot(data = check1, aes(x = control_type, y = value, color = control_type)) +
  geom_boxplot() +
  theme_bw() +
  scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
  facet_wrap(~variable, scales = "free_y") +
  theme(axis.text.x = element_text(angle=45, h=1))

Few reads, but non negligible amounts of MOTUs in negative controls.

Overview in the PCR design context

ggpcrtag(mblist, legend_title = "# of reads per PCR",
         FUN = function(m) {
           rowSums(m$reads)
         },
         taglist = as.character(unique(mblist$pcrs$tag_rev)))

Sequencing control distribution strange (i.e. does not allow testing all tags).
Many samples with low sequencing depth. Let’s hope their signal is not similar to sequencing blanks. Potential problem with all samples with reverse tag “gcgtcagc”. This is the same tag that we spotted in the Poland data (generated in a different place)… => can be a pb of the tag itself in the reverse position?

NB: Appearance of an artifactual NA line at the top, not to be considered (ggpcrtag function pb? cannot identify it for now. Distribution of samples congruent with what indicated in “NGSfilters_lezards.xlsx”).

==> Flag samples with potentially corrupted reverse tag/

mblist$pcrs$pcrbias <- ifelse(mblist$pcrs$tag_rev == "gcgtcagc", "tag pb", NA)

Rarefaction curves

Rarefaction curves computed with the Hill numbers for \(q={0,1,2}\). They are equivalent to richness (\(q=0\)), the exponential of the Shannon index (\(q=1\)) and the inverse of the Simpson index (\(q=2\)). They give more or less weight to rare species that may correspond to molecular artifacts. The Good’s coverage index (i.e. proportion of motus that are not singletons) is also provide.

raref <- hill_rarefaction(mblist, nboot = 20, nsteps = 10)
saveRDS(raref, "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez2015_2018_rarcruves.rds")

Vizualisation according to control types

# create vector with control type info for each pcr designated by their pcr names
control_type <- mblist$pcrs$control_type
names(control_type) <- rownames(mblist$pcrs)

raref = readRDS("~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez2015_2018_rarcruves.rds")

gghill_rarefaction(raref, group = control_type) +
  scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
  scale_fill_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
  labs(color="control_type")

Ok, most controls not visible, saturation at \(^1D\) in all cases as expected.

Correlation between sequencing depth and richness

Check whether the number of MOTUs and reads are correlated. We expect them to be especially correlated if many rare taxa are present, knowung that they are most likely errors. Should be visible in sequencing controls or samples with super high sequencing depth (higher likelihood of sequencing errors of abundant motus).

#also compute diversity at q=1
mblist$pcrs$q1 <- exp(diversity(mblist$reads, "shannon"))

# plot
tmp <- melt(mblist$pcrs[c("nb_reads", "nb_motus", "q1", "control_type")], 
            id.var = c("nb_reads", "control_type"))
tmp$variable <- gsub("nb_motus", "q0", tmp$variable)

ggplot(tmp, aes(x=nb_reads, y=value, color = control_type)) +
  geom_point(alpha=0.2) + theme_bw() +
  scale_y_log10() + scale_x_log10() +
  scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") + 
  facet_wrap(~variable) + 
  labs(y="diversity") + 
  theme(legend.position = "bottom")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis

Looks good. Samples with low # reads do not behave like sequencing controls.

Flagging contaminants and suprious signal

Exogenous contaminants

Identify sequences more abundant (in frequency over the whole dataset) in negative controls.

# field
mblist <- contaslayer(mblist,
                      controls =
                        rownames(mblist$pcrs)[which(mblist$pcrs$control_type ==
                                                      "positive")],
                      output_col = "not_a_fieldopen_contaminant")

# extraction
mblist <- contaslayer(mblist,
                      controls =
                        rownames(mblist$pcrs)[which(mblist$pcrs$control_type ==
                                                      "extraction")],
                      output_col = "not_an_extraction_contaminant")

# pcr
mblist <- contaslayer(mblist,
                      controls =
                        rownames(mblist$pcrs)[which(mblist$pcrs$control_type ==
                                                      "pcr")],
                      output_col = "not_a_pcr_contaminant")

# sequencing
mblist <- contaslayer(mblist,
                      controls =
                        rownames(mblist$pcrs)[which(mblist$pcrs$control_type ==
                                                      "sequencing")],
                      output_col = "not_a_sequencing_contaminant")

# check if they overlap
table(rowSums(!mblist$motus[,grep("^not_a", colnames(mblist$motus))]))
## 
##    0    1 
## 7987   83
# do not overlap ... 

# flag contaminant sequences
mblist$motus$bias <- NA
mblist$motus$bias[which(!mblist$motus$not_a_fieldopen_contaminant)] <- "conta fieldopen"
mblist$motus$bias[which(!mblist$motus$not_an_extraction_contaminant)] <- "conta extraction"
mblist$motus$bias[which(!mblist$motus$not_a_pcr_contaminant)] <- "conta pcr"
mblist$motus$bias[which(!mblist$motus$not_a_sequencing_contaminant)] <- "conta sequencing" #0

library(kableExtra)
dt <- mblist$motus[!is.na(mblist$motus$bias), c("count", "similarity", "lineage_silva", "bias")]

kable(dt[order(dt[,1], decreasing=T),]) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                 font_size = 8)
count similarity lineage_silva bias
M02944:302:000000000-C5NTF:1:1102:6533:3428_CONS_SUB_SUB 2044 100.00 Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Pantoea; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:11787:2373_CONS_SUB_SUB 1109 100.00 Bacteria;Actinobacteria;Actinobacteria;Propionibacteriales;Propionibacteriaceae;Cutibacterium; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:18068:2552_CONS_SUB_SUB 1048 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Caulobacterales;Caulobacteraceae;uncultured; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:20248:4898_CONS_SUB_SUB_CMP 611 100.00 Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Intrasporangiaceae;Janibacter; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:18194:23454_CONS_SUB_SUB_CMP 311 100.00 Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Micrococcaceae;Micrococcus; conta fieldopen
M02944:302:000000000-C5NTF:1:2112:6450:18874_CONS_SUB_SUB 306 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Rhodobacterales;Rhodobacteraceae;Paracoccus; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:18259:6772_CONS_SUB_SUB_CMP 276 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Rhodobacterales;Rhodobacteraceae;Paracoccus; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:26296:14637_CONS_SUB_SUB 94 99.81 Bacteria;Proteobacteria;Gammaproteobacteria;Xanthomonadales;Xanthomonadaceae;Pseudoxanthomonas; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:14820:15754_CONS_SUB_SUB 69 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Rhodobacterales;Rhodobacteraceae;Paracoccus; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:4586:12680_CONS_SUB_SUB_CMP 60 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Caulobacterales;Caulobacteraceae;Brevundimonas; conta fieldopen
M02944:302:000000000-C5NTF:1:1101:13951:5153_CONS_SUB_SUB_CMP 52 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Novosphingobium; conta fieldopen
M02944:302:000000000-C5NTF:1:2111:7786:7084_CONS_SUB_SUB 46 99.23 Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingomonas; conta fieldopen
M02944:302:000000000-C5NTF:1:2114:20654:17967_CONS_SUB_SUB 35 100.00 Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Haemophilus; conta fieldopen
M02944:302:000000000-C5NTF:1:2115:5248:5005_CONS_SUB_SUB_CMP 32 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingomonas; conta fieldopen
M02944:302:000000000-C5NTF:1:2110:21140:20335_CONS_SUB_SUB 28 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Rhodobacterales;Rhodobacteraceae;Paracoccus; conta fieldopen
M02944:302:000000000-C5NTF:1:2112:14487:4063_CONS_SUB_SUB_CMP 25 100.00 Bacteria;Bacteroidetes;Bacteroidia;Sphingobacteriales;Sphingobacteriaceae;Pedobacter; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:14741:6417_CONS_SUB_SUB_CMP 15 95.27 Bacteria;Proteobacteria;Alphaproteobacteria;Thalassobaculales;uncultured; conta fieldopen
M02944:302:000000000-C5NTF:1:2106:4760:13375_CONS_SUB_SUB_CMP 14 100.00 Bacteria;Proteobacteria;Gammaproteobacteria;Oceanospirillales;Halomonadaceae;Halomonas; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:24835:7732_CONS_SUB_SUB 14 100.00 Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Microbacteriaceae;Galbitalea; conta extraction
M02944:302:000000000-C5NTF:1:1101:15462:20664_CONS_SUB_SUB 13 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Rhodobacterales;Rhodobacteraceae;Paracoccus; conta fieldopen
M02944:302:000000000-C5NTF:1:1101:10358:4588_CONS_SUB_SUB_CMP 13 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Ellin6055; conta fieldopen
M02944:302:000000000-C5NTF:1:1111:12930:3349_CONS_SUB_SUB 11 100.00 Bacteria;Firmicutes;Negativicutes;Selenomonadales;Veillonellaceae;Veillonella; conta fieldopen
M02944:302:000000000-C5NTF:1:1112:16267:16761_CONS_SUB_SUB 11 97.07 Bacteria;Actinobacteria;Coriobacteriia;Coriobacteriales;Coriobacteriaceae;Collinsella; conta extraction
M02944:302:000000000-C5NTF:1:2113:12392:2891_CONS_SUB_SUB_CMP 10 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Altererythrobacter; conta extraction
M02944:302:000000000-C5NTF:1:2113:11358:18807_CONS_SUB_SUB_CMP 9 100.00 Bacteria;Actinobacteria;Actinobacteria;Corynebacteriales;Corynebacteriaceae;Corynebacterium 1; conta extraction
M02944:302:000000000-C5NTF:1:2109:29037:13131_CONS_SUB_SUB_CMP 9 100.00 Bacteria;Firmicutes;Bacilli;Bacillales;Family XI;Gemella; conta extraction
M02944:302:000000000-C5NTF:1:1116:16243:8855_CONS_SUB_SUB 8 99.81 Bacteria;Actinobacteria;Actinobacteria;Kineosporiales;Kineosporiaceae;Kineosporia; conta extraction
M02944:302:000000000-C5NTF:1:2105:9191:9805_CONS_SUB_SUB_CMP 8 86.20 Bacteria;Epsilonbacteraeota;Campylobacteria;Campylobacterales;Helicobacteraceae;Helicobacter; conta extraction
M02944:302:000000000-C5NTF:1:2114:6512:11004_CONS_SUB_SUB 7 99.81 Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas; conta fieldopen
M02944:302:000000000-C5NTF:1:2104:15750:6432_CONS_SUB_SUB_CMP 7 97.49 Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Mobilitalea; conta extraction
M02944:302:000000000-C5NTF:1:2108:9981:18629_CONS_SUB_SUB_CMP 7 87.50 Bacteria;Epsilonbacteraeota;Campylobacteria;Campylobacterales;Helicobacteraceae;Helicobacter; conta fieldopen
M02944:302:000000000-C5NTF:1:2108:15503:15034_CONS_SUB_SUB_CMP 7 100.00 Bacteria;Proteobacteria;Gammaproteobacteria;Betaproteobacteriales;Burkholderiaceae;uncultured; conta extraction
M02944:302:000000000-C5NTF:1:2115:18620:24734_CONS_SUB_SUB_CMP 7 99.61 Bacteria;Bacteroidetes;Bacteroidia;Cytophagales;Spirosomaceae;Spirosoma; conta fieldopen
M02944:302:000000000-C5NTF:1:1101:23049:19885_CONS_SUB_SUB 7 96.73 Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Lachnospiraceae UCG-008; conta extraction
M02944:302:000000000-C5NTF:1:2113:13297:6472_CONS_SUB_SUB_CMP 7 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Beijerinckiaceae;Microvirga; conta fieldopen
M02944:302:000000000-C5NTF:1:1101:24408:13225_CONS_SUB_SUB_CMP 7 99.03 Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Robinsoniella; conta fieldopen
M02944:302:000000000-C5NTF:1:1101:19336:16054_CONS_SUB_SUB 6 100.00 Bacteria;Proteobacteria;Gammaproteobacteria;Betaproteobacteriales;Methylophilaceae;uncultured; conta extraction
M02944:302:000000000-C5NTF:1:2115:23975:12882_CONS_SUB_SUB 6 99.81 Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Micrococcaceae;Kocuria; conta fieldopen
M02944:302:000000000-C5NTF:1:2107:25916:8830_CONS_SUB_SUB 5 100.00 Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Actinobacillus; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:6720:12462_CONS_SUB_SUB 5 98.41 Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides; conta extraction
M02944:302:000000000-C5NTF:1:2105:17441:8760_CONS_SUB_SUB 5 96.51 Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Lachnospiraceae NK3A20 group; conta extraction
M02944:302:000000000-C5NTF:1:2112:17378:4484_CONS_SUB_SUB_CMP 5 99.81 Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingopyxis; conta fieldopen
M02944:302:000000000-C5NTF:1:1112:14910:12745_CONS_SUB_SUB_CMP 5 96.00 Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Rikenellaceae;Rikenella; conta fieldopen
M02944:302:000000000-C5NTF:1:2114:27440:8582_CONS_SUB_SUB_CMP 4 100.00 Bacteria;Proteobacteria;Gammaproteobacteria;Betaproteobacteriales;Burkholderiaceae;Rhizobacter; conta extraction
M02944:302:000000000-C5NTF:1:2111:12145:14422_CONS_SUB_SUB_CMP 4 100.00 Bacteria;Actinobacteria;Actinobacteria;Corynebacteriales;Corynebacteriaceae;Turicella; conta extraction
M02944:302:000000000-C5NTF:1:1108:6125:12384_CONS_SUB_SUB 4 100.00 Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Dysgonomonadaceae;Dysgonomonas; conta extraction
M02944:302:000000000-C5NTF:1:2115:2578:14900_CONS_SUB_SUB_CMP 4 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Caulobacterales;Caulobacteraceae;Asticcacaulis; conta extraction
M02944:302:000000000-C5NTF:1:1108:11664:10574_CONS_SUB_SUB 4 99.81 Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Brevibacteriaceae;Brevibacterium; conta fieldopen
M02944:302:000000000-C5NTF:1:1111:4503:9208_CONS_SUB_SUB_CMP 4 96.34 Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Ruminococcus 1; conta extraction
M02944:302:000000000-C5NTF:1:1105:21258:17018_CONS_SUB_SUB_CMP 4 96.88 Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Faecalibacterium; conta fieldopen
M02944:302:000000000-C5NTF:1:2110:21651:19187_CONS_SUB_SUB_CMP 4 95.00 Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Faecalibacterium; conta extraction
M02944:302:000000000-C5NTF:1:2104:25502:9867_CONS_SUB_SUB_CMP 4 89.16 Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Ruminococcaceae UCG-014; conta extraction
M02944:302:000000000-C5NTF:1:2113:21235:9318_CONS_SUB_SUB 3 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Caulobacterales;Caulobacteraceae;Caulobacter; conta extraction
M02944:302:000000000-C5NTF:1:1115:20591:7956_CONS_SUB_SUB_CMP 3 95.91 Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Rhizobiaceae;Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium; conta extraction
M02944:302:000000000-C5NTF:1:1106:24209:10282_CONS_SUB_SUB 3 87.75 Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides; conta extraction
M02944:302:000000000-C5NTF:1:1112:10846:10094_CONS_SUB_SUB 3 86.19 Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Ruminiclostridium 6; conta fieldopen
M02944:302:000000000-C5NTF:1:2116:14852:8020_CONS_SUB_SUB_CMP 3 94.10 Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;ASF356; conta fieldopen
M02944:302:000000000-C5NTF:1:1106:28966:14237_CONS_SUB_SUB 3 88.26 Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides; conta pcr
M02944:302:000000000-C5NTF:1:1110:8393:19940_CONS_SUB_SUB_CMP 2 96.51 Bacteria;Firmicutes;Clostridia;Clostridiales;Family XIII;Family XIII UCG-001; conta fieldopen
M02944:302:000000000-C5NTF:1:2114:8675:6203_CONS_SUB_SUB 2 94.19 Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Lachnospiraceae NK3A20 group; conta fieldopen
M02944:302:000000000-C5NTF:1:2107:25297:23116_CONS_SUB_SUB 2 99.81 Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus; conta pcr
M02944:302:000000000-C5NTF:1:2112:25256:10745_CONS_SUB_SUB 2 99.81 Bacteria;Proteobacteria;Deltaproteobacteria;Oligoflexales;Oligoflexaceae;Oligoflexus; conta extraction
M02944:302:000000000-C5NTF:1:1106:26438:16644_CONS_SUB_SUB_CMP 2 97.10 Bacteria;Firmicutes;Clostridia;Clostridiales;Family XIII;uncultured; conta fieldopen
M02944:302:000000000-C5NTF:1:1111:19295:22326_CONS_SUB_SUB_CMP 2 99.81 Bacteria;Actinobacteria;Acidimicrobiia;Microtrichales;Ilumatobacteraceae;CL500-29 marine group; conta fieldopen
M02944:302:000000000-C5NTF:1:1102:14654:8734_CONS_SUB_SUB 2 100.00 Bacteria;Deinococcus-Thermus;Deinococci;Deinococcales;Deinococcaceae;Deinococcus; conta fieldopen
M02944:302:000000000-C5NTF:1:1115:13182:19432_CONS_SUB_SUB_CMP 2 95.37 Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Paludibacteraceae;Paludibacter; conta extraction
M02944:302:000000000-C5NTF:1:2102:4700:11012_CONS_SUB_SUB 2 96.90 Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus; conta fieldopen
M02944:302:000000000-C5NTF:1:2109:19958:20100_CONS_SUB_SUB 2 100.00 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales;Marinobacteraceae;Marinobacter; conta fieldopen
M02944:302:000000000-C5NTF:1:1105:10700:11474_CONS_SUB_SUB 2 100.00 Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Porphyromonadaceae;Porphyromonas; conta fieldopen
M02944:302:000000000-C5NTF:1:2103:8240:21223_CONS_SUB_SUB_CMP 2 88.00 Bacteria;Epsilonbacteraeota;Campylobacteria;Campylobacterales;Helicobacteraceae;Helicobacter; conta extraction
M02944:302:000000000-C5NTF:1:2113:17367:22840_CONS_SUB_SUB 2 99.62 Bacteria;BRC1; conta fieldopen
M02944:302:000000000-C5NTF:1:2115:24129:14093_CONS_SUB_SUB_CMP 2 96.14 Bacteria;Firmicutes;Bacilli;Lactobacillales;P5D1-392; conta fieldopen
M02944:302:000000000-C5NTF:1:2110:21441:3238_CONS_SUB_SUB_CMP 2 96.31 Bacteria;Proteobacteria;Deltaproteobacteria;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio; conta fieldopen
M02944:302:000000000-C5NTF:1:2108:18727:17807_CONS_SUB_SUB_CMP 2 100.00 Bacteria;Actinobacteria;Coriobacteriia;OPB41; conta fieldopen
M02944:302:000000000-C5NTF:1:2109:10408:10892_CONS_SUB_SUB 2 98.65 Bacteria;Proteobacteria;Gammaproteobacteria;Betaproteobacteriales;Burkholderiaceae;Limnobacter; conta extraction
M02944:302:000000000-C5NTF:1:1113:18203:8055_CONS_SUB_SUB 2 97.62 Bacteria;Bacteroidetes;Bacteroidia;Flavobacteriales;Weeksellaceae;Chryseobacterium; conta extraction
M02944:302:000000000-C5NTF:1:1101:17541:24684_CONS_SUB_SUB 2 98.25 Bacteria;Chlamydiae;Chlamydiae;Chlamydiales;Parachlamydiaceae;uncultured; conta extraction
M02944:302:000000000-C5NTF:1:1101:7017:6084_CONS_SUB_SUB 2 96.40 Bacteria;Epsilonbacteraeota;Campylobacteria;Campylobacterales;Helicobacteraceae;Helicobacter; conta extraction
M02944:302:000000000-C5NTF:1:2106:12685:12487_CONS_SUB_SUB_CMP 2 99.80 Bacteria;Firmicutes;Clostridia;Clostridiales;Family XI;Peptoniphilus; conta extraction
M02944:302:000000000-C5NTF:1:1110:29102:15287_CONS_SUB_SUB_CMP 2 99.62 Bacteria;Proteobacteria;Gammaproteobacteria;Xanthomonadales;Xanthomonadaceae;Xanthomonas; conta extraction
M02944:302:000000000-C5NTF:1:2104:16591:18431_CONS_SUB_SUB_CMP 2 95.69 Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides; conta fieldopen
M02944:302:000000000-C5NTF:1:2106:15440:22800_CONS_SUB_SUB 2 90.50 Bacteria;Fusobacteria;Fusobacteriia;Fusobacteriales;Fusobacteriaceae;Fusobacterium; conta extraction
M02944:302:000000000-C5NTF:1:1111:19632:6206_CONS_SUB_SUB_CMP 2 100.00 Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingorhabdus; conta pcr

Distribution of the top 3 contaminants in negative controls on the pcr plates.

for (i in 1:3){
  max.conta <- rownames(dt)[order(dt$count, decreasing=T)[i]]
  p <- ggpcrtag(mblist, legend_title = "# of reads of the most abundant conta",
         FUN = function(m) {m$reads[,max.conta]},
         taglist = as.character(unique(mblist$pcrs$tag_rev)))
  print(as.vector(mblist$motus[max.conta,"lineage_silva"]))
  print(p)
}
## [1] "Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Pantoea;"

## [1] "Bacteria;Actinobacteria;Actinobacteria;Propionibacteriales;Propionibacteriaceae;Cutibacterium;"

## [1] "Bacteria;Proteobacteria;Alphaproteobacteria;Caulobacterales;Caulobacteraceae;uncultured;"

The first is a plant pathogen and responsible for some infection in humans, the second is associated with human skin (one sp member of that genus cause acne).

Distribution in the whole dataset

tmp <- melt(decostand(mblist$reads, "total")[,!is.na(mblist$motus$bias)])
tmp$sample_type <- mblist$pcrs$control_type[match(tmp$Var1, rownames(mblist$pcrs))]
tmp$conta_type <- gsub(" ", "_", mblist$motus$bias[match(tmp$Var2, rownames(mblist$motus))])

ggplot(tmp, aes(x=sample_type, y=value)) + 
  geom_boxplot(color="grey40") +
  geom_jitter(aes(color=sample_type), width = 0.2, alpha=0.5) +
  facet_wrap(~conta_type, ncol=4, 
             labeller = label_parsed) +
  scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
  labs(x=NULL, y="Prop. Reads (log10)") + 
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle=45, hjust=1), 
        strip.background = element_blank(), 
        strip.text = element_text(face="bold", hjust=0),
        legend.position = "bottom") + 
  scale_y_log10()

Motus detected as contaminants are overall in low abundance in samples, except for field open sample.. Difficult to say if its indeed true contaminants or contaminations with biological samples at this stage…

Flag samples with too many contaminants.

tmp <-
  data.frame(conta.relab = rowSums(decostand(mblist$reads, "total")[, !is.na(mblist$motus$bias)]))
tmp$sample_type <- mblist$pcrs$control_type[match(rownames(tmp), rownames(mblist$pcrs))]

p = ggplot(tmp, aes(x=sample_type, y=conta.relab)) + 
  geom_boxplot(color="grey40") +
  geom_jitter(aes(color=sample_type), width = 0.2, alpha=0.5) +
  scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
  labs(x=NULL, y="Prop. Reads (log10)") + 
  theme_bw() + 
  scale_y_log10()

thresh <- 5e-02
p + 
  geom_hline(yintercept = thresh, lty=2, col="darkgreen", lwd=1) +
  annotate("text", x=1, y=thresh, label="potential failed PCR: too many contaminants", 
           col="darkgreen", hjust=0, fontface="bold")

mblist$pcrs$pcrbias[which(tmp$conta.relab[match(rownames(mblist$pcrs), rownames(tmp))] > thresh)] <-
  ifelse(
    is.na(mblist$pcrs$pcrbias[which(tmp$conta.relab[match(rownames(mblist$pcrs), rownames(tmp))] >
                                      thresh)]),  "too much conta",
    paste(mblist$pcrs$pcrbias[which(tmp$conta.relab[match(rownames(mblist$pcrs), rownames(tmp))] >
                                      thresh)], "too much conta", sep = "|")
  )

Untargeted clades

Exclude archaea (not targeted by the primers -> biased amplificaiton), and eukaryotes mitochondria or chloroplasts.

idx <- grep("Bacteria", mblist$motus$lineage_silva, invert = TRUE)
conta_bio <- rownames(mblist$motus)[idx]

#tag motus accordingly (append with previous bias identified)
mblist$motus$bias[match(conta_bio, rownames(mblist$motus))] <- 
  ifelse(is.na(mblist$motus$bias[match(conta_bio, rownames(mblist$motus))]), "untargeted clade",
         paste(mblist$motus$bias[match(conta_bio, rownames(mblist$motus))], "untargeted clade",
               sep="|"))

#show them in a table
dt <- mblist$motus[conta_bio, c("count", "similarity", "lineage_silva", "bias")]

kable(dt[order(dt[,1], decreasing=T),]) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                 font_size = 8)
count similarity lineage_silva bias
M02944:302:000000000-C5NTF:1:1102:19828:10474_CONS_SUB_SUB 79 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1102:26062:21194_CONS_SUB_SUB 30 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2104:4756:19425_CONS_SUB_SUB 27 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1102:24563:19138_CONS_SUB_SUB 26 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2111:15208:24009_CONS_SUB_SUB_CMP 21 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2112:24836:11578_CONS_SUB_SUB_CMP 20 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2115:21232:10051_CONS_SUB_SUB 17 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1108:7601:7020_CONS_SUB_SUB_CMP 9 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1105:20803:23631_CONS_SUB_SUB 6 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2115:15013:17213_CONS_SUB_SUB 5 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2119:3851:11610_CONS_SUB_SUB_CMP 5 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2113:27150:5316_CONS_SUB_SUB 4 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2107:9536:16765_CONS_SUB_SUB 4 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2112:18642:18647_CONS_SUB_SUB 3 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2110:23135:6738_CONS_SUB_SUB_CMP 3 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1102:22699:18659_CONS_SUB_SUB 3 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2105:11769:6823_CONS_SUB_SUB 3 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2112:12661:17767_CONS_SUB_SUB_CMP 3 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2104:7016:11544_CONS_SUB_SUB 3 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1109:6445:10780_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2106:19509:21253_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2115:25205:16362_CONS_SUB_SUB_CMP 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1103:26303:16803_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1110:3058:16325_CONS_SUB_SUB_CMP 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2103:26934:4823_CONS_SUB_SUB_CMP 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2103:26813:5575_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1111:17595:14021_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2107:10920:13279_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2106:5336:9643_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2102:11197:10513_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1116:17849:6243_CONS_SUB_SUB_CMP 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2117:11781:7738_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2106:29005:14968_CONS_SUB_SUB_CMP 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1112:4204:11818_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2111:18455:14087_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2103:26901:4810_CONS_SUB_SUB_CMP 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:1101:22318:17024_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2113:18410:12985_CONS_SUB_SUB 2 NA NA untargeted clade
M02944:302:000000000-C5NTF:1:2115:18257:15709_CONS_SUB_SUB_CMP 2 NA NA untargeted clade

Either super artifactual sequences or unkown bugs; a blast on ncbi also returns no match too. In any cases, they remain rare in the dataset.

Degraded sequences

Assumption that low % similarity corresponds to degraded sequences,since the 16S is well conserved and now relatively well covered (and the fact that 75% similarity is not far from being a random alignment).

a <- 
  ggplot(mblist$motus, aes(x=similarity)) + 
    geom_histogram(color="grey", fill="white", bins=20) + 
    theme_bw() + scale_x_continuous(limits=c(60, 100)) +
    theme(panel.grid = element_blank()) + 
    labs(x="% similarity against best match", y="# MOTUs")


thresh <- 80
a <- 
  a + geom_vline(xintercept = thresh, lty=2, color="darkgreen") +
    annotate("text", x=thresh-0.02, y=250,
             label="Potential\ndegraded MOTUs", col="darkgreen", hjust=1, fontface="bold") 
  

b <-  
  ggplot(mblist$motus, aes(x=similarity, y = ..count.., weight = count)) + 
    geom_histogram(color="grey", fill="white", bins=20) + 
    theme_bw() + scale_x_continuous(limits=c(60, 100)) +
    theme(panel.grid = element_blank()) + 
    labs(x="% similarity against best match", y="# Reads")

b <- b + geom_vline(xintercept = thresh, lty=2, color="darkgreen") +
      annotate("text", x=thresh-0.02, y=3e6, 
               label="Potential highly\ndegraded MOTUs", col="darkgreen", hjust=1, fontface="bold")  
 
ggdraw() + 
  draw_plot(a, x=0, y=0, width = 0.5) + 
  draw_plot(b, x=0.5, y=0, width = 0.5)

#Flag corresponding motus 
conta.deg <- rownames(mblist$motus)[which(mblist$motus$similarity<thresh)]
mblist$motus$bias[match(conta.deg, rownames(mblist$motus))] <- 
  ifelse(is.na(mblist$motus$bias[match(conta.deg, rownames(mblist$motus))]), "conta deg",
         paste(mblist$motus$bias[match(conta.deg, rownames(mblist$motus))], "conta deg", sep="|"))

Only one such a sequence -> must have been filtered before, maybe directly in the SILVAngs pipeline (?).

Tag-jumps

One expects abundant genuine MOTUs to be found in low abundance in samples were they are not supposed to be. The rate of tag-jumps is difficult to anticipate and depends on several things (mainly how the library was prepared). One way to assess this is to remove rare MOTUs at different thresholds of abudnance and see when the sequencing controls are empty.

#define thresholds
thresh <- c(0,1e-5, 1e-4,1e-3, 1e-2, 3e-2, 5e-2) 
tests <- lapply(thresh, function(x) tagjumpslayer(mblist,x, method = "cut")) # can be changed to "substract" but not sure it is appropriate
names(tests) <- thresh

#format for plot
tmp <- melt(as.matrix(do.call("rbind", lapply(tests, function(x) rowSums(x$reads)))))
colnames(tmp) <- c("thresh", "sample", "abund_contaout")
tmp$rich_contaout <-
  melt(as.matrix(do.call("rbind", lapply(tests, function(x) {
    specnumber(x$reads[, is.na(mblist$motus$bias)])
  }))))$value
tmp$controls = mblist$pcrs$control_type[match(tmp$sample, rownames(mblist$pcrs))]

tmp2 <- melt(tmp, id.vars=colnames(tmp)[-grep("contaout", colnames(tmp))])
tmp2$variable <- ifelse(tmp2$variable=="rich_contaout", "#MOTUs", "#Reads")

a <- 
  ggplot(tmp2, aes(x=as.factor(thresh), y=value)) + 
    geom_boxplot(color="grey40") +
    geom_jitter(aes(color=controls), width = 0.2, alpha=0.5) + 
    scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
    facet_wrap(~variable+controls, scale="free_y", ncol=5) + 
    theme_bw() + 
    labs(x="rel Abundance filtering threshold", y="#Reads/MOTUs") + 
    theme(panel.grid = element_blank(), 
          strip.background = element_blank(), 
          axis.text.x = element_text(angle=40, h=1), 
          legend.position = "none")

threshfix <- 0.01
a + geom_vline(xintercept = match(threshfix, c(0,1e-5, 1e-4,1e-3, 1e-2, 3e-2, 5e-2)), lty=2, 
             color="darkgreen")

Let’s plot the abundance of the most abundant MOTU before and after filtering.

nkeep <- which(thresh == threshfix)
#plot in PCR plates
ggpcrtag(mblist, legend_title = "Most abundant MOTU abundance before tagjump curation",
         FUN = function(m) {mblist$reads[,which.max(mblist$motus$count)]},
         taglist = as.character(unique(mblist$pcrs$tag_rev)))

#plot in PCR plates
ggpcrtag(mblist, legend_title = "Most abundant MOTU abundance after tagjump curation",
         FUN = function(m) {tests[[nkeep]]$reads[,which.max(mblist$motus$count)]},
         taglist = as.character(unique(mblist$pcrs$tag_rev)))

Dysfunctional pcrs

Identify pcrs with low amounts of reads.

a <- 
  ggplot(mblist$pcrs, aes(nb_reads)) +
    geom_histogram(bins=40, color="grey", fill="white") + 
    scale_x_log10() + 
    labs(x="# Reads (with all OTUs and PCRs)", 
        y="# PCRs") +
    theme_bw() + 
    theme(panel.grid = element_blank())

thresh = 1e2 #based on what can be found in sequencing controls.
a + geom_vline(xintercept = thresh, lty=2, color="darkgreen")
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 289 rows containing non-finite values (stat_bin).

mblist$pcrs$pcrbias[which(mblist$pcrs$nb_reads<thresh)] = 
  ifelse(is.na(mblist$pcrs$pcrbias[which(mblist$pcrs$nb_reads<thresh)]), "pcr empty",
         paste(mblist$pcrs$pcrbias[which(mblist$pcrs$nb_reads<thresh)],
               "pcr empty", sep="|"))

Noise overview

summary(as.factor(mblist$motus$bias))
##        conta deg conta extraction  conta fieldopen        conta pcr 
##                1               33               47                3 
## untargeted clade             NA's 
##               39             7947
#motus
paste("MOTUs considered as contaminant/spurious sequences: ",
      round(sum(!is.na(mblist$motus$bias)) / length(mblist$motus$bias) * 100, 3),
      " %")
## [1] "MOTUs considered as contaminant/spurious sequences:  1.524  %"
#pcrs
paste("PCRs (control excluded) considered as 'dysfunctional': ",
      round(sum(!is.na(mblist$pcrs$pcrbias) & (mblist$pcrs$type == "sample")) / length(mblist$pcrs$pcrbias) * 100, 3),
      " %")
## [1] "PCRs (control excluded) considered as 'dysfunctional':  3.125  %"
#plots
tmp <- as.data.frame(table(mblist$motus$bias, useNA = "ifany"))
tmp$Perc <- tmp$Freq/sum(tmp$Freq)
tmp$max <- cumsum(tmp$Perc)
tmp$min <- c(0, head(tmp$max, n=-1))
set.seed(9)
cola <- sample(colors(), nlevels(tmp$Var1))
a <- 
  ggplot(tmp, aes(ymax=max, ymin=min, xmax=4, xmin=3, fill=Var1)) +
  geom_rect() + xlim(c(2, 4)) + labs(fill="Bias type") + 
  coord_polar(theta="y") + theme_void() + 
  scale_fill_manual(values = cola, na.value = "darkgrey") + 
  theme(legend.position = "bottom", legend.direction = "vertical")

tmp <- as.data.frame(table(mblist$pcrs$pcrbias[mblist$pcrs$type=="sample"], useNA = "ifany"))
tmp$Perc <- tmp$Freq/sum(tmp$Freq)
tmp$max <- cumsum(tmp$Perc)
tmp$min <- c(0, head(tmp$max, n=-1))
set.seed(3)
colb <- sample(colors(), nlevels(tmp$Var1))
b <- 
  ggplot(tmp, aes(ymax=max, ymin=min, xmax=4, xmin=3, fill=Var1)) +
  geom_rect() + xlim(c(2, 4)) + labs(fill="Bias type") + 
  coord_polar(theta="y") + theme_void() + 
  scale_fill_manual(values = colb, na.value = "darkgrey") + 
  theme(legend.position = "bottom", legend.direction = "vertical")

ggdraw() + 
  draw_plot(a, x=0, y=0, width=0.5, height=1) + 
  draw_label("MOTUs", x=0.1, y=0.9) + 
  draw_plot(b, x=0.5, y=0, width=0.5, height=1) + 
  draw_label("PCRs", x=0.6, y=0.9)

Clean and compare

Removal of suprious MOTUs, PCRs as well as adjusting read counts by removing tagjumps. Also ensure that exclude any control.

tmp <- tests[[nkeep]]
tmp$motus$bias <- mblist$motus$bias
tmp$pcrs$pcrbias <- mblist$pcrs$pcrbias
tmp1 <- subset_metabarlist(subset_metabarlist(tmp, "motus", is.na(tmp$motus$bias)),
                           "pcrs", is.na(tmp$pcrs$pcrbias))

table(tmp1$pcrs$type)
## 
## control  sample 
##     102     941
mblist_clean <- subset_metabarlist(tmp1, "pcrs", tmp1$pcrs$type=="sample")

summary_metabarlist(mblist)
## $dataset_dimension
##         n_row n_col
## reads    1440  8070
## motus    8070    50
## pcrs     1440    19
## samples   986    12
## 
## $dataset_statistics
##         nb_reads nb_motus avg_reads sd_reads avg_motus sd_motus
## pcrs    11183625     8070  7766.406 13850.99  126.3639 165.1573
## samples 11124692     8047 11282.649 15520.82  175.5984 178.1532
summary_metabarlist(tmp1)
## $dataset_dimension
##         n_row n_col
## reads    1043  7944
## motus    7944    50
## pcrs     1043    19
## samples   941    12
## 
## $dataset_statistics
##         nb_reads nb_motus avg_reads sd_reads avg_motus sd_motus
## pcrs    11038919     7944  10583.81 15315.19  136.1198 178.1605
## samples 10999397     7943  11689.05 15728.86  148.7088 183.1015
summary_metabarlist(mblist_clean)
## $dataset_dimension
##         n_row n_col
## reads     941  7943
## motus    7943    50
## pcrs      941    19
## samples   941    12
## 
## $dataset_statistics
##         nb_reads nb_motus avg_reads sd_reads avg_motus sd_motus
## pcrs    10999397     7943  11689.05 15728.86  148.7088 183.1015
## samples 10999397     7943  11689.05 15728.86  148.7088 183.1015

Now ensure removing any empty PCR or MOTUs (caused by removal of spurious MOTU or PCR)

if(sum(colSums(mblist_clean$reads)==0)>0){print("empty motus present")}
if(sum(colSums(mblist_clean$reads)==0)>0){print("empty pcrs present")}

Update some parameters in the metabarlist (e.g. counts, etc.)

mblist_clean$motus$count <- colSums(mblist_clean$reads)
mblist_clean$pcrs$obitools.reads <- mblist_clean$pcrs$nb_reads
mblist_clean$pcrs$nb_reads <- mblist_clean$pcrs$metabaR.reads <- rowSums(mblist_clean$reads)
mblist_clean$pcrs$obitools.otus <- mblist_clean$pcrs$nb_motus
mblist_clean$pcrs$nb_motus <-
  mblist_clean$pcrs$metabaR.otus <-
  rowSums(ifelse(mblist_clean$reads > 0, T, F))

Compare before and after trimming gross stats

check.bioinfo <-
  rbind(data.frame(melt(as.matrix(mblist_clean$pcrs)[, 
                                                     grep("\\.reads", colnames(mblist_clean$pcrs))]),
                   type = "reads"),
        data.frame(melt(as.matrix(mblist_clean$pcrs)[, 
                                                     grep("\\.otus", colnames(mblist_clean$pcrs))]),
                   type = "motus"))

check.bioinfo$variable <- gsub("\\.reads|\\.otus", "", check.bioinfo$Var2)
check.bioinfo$value <- as.numeric(as.vector(check.bioinfo$value))

ggplot(data = check.bioinfo, aes(x = Var2, y = value)) +
  geom_boxplot( color = "darkgrey") +
  geom_jitter(alpha=0.1, color = "darkgrey") +
  theme_bw() + labs(x="Analysis step") +
  facet_wrap(~type, scales = "free", ncol = 5) +
  theme(axis.text.x = element_text(angle=45, h=1))

Compare before and after trimming alpha diversity patterns

raref2 = hill_rarefaction(mblist_clean, nboot = 20, nsteps = 10)
saveRDS(raref2, "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez2015_2018_rarcruves_clean.rds")
raref2 = readRDS("~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez2015_2018_rarcruves_clean.rds")

a = gghill_rarefaction(raref)

b = gghill_rarefaction(raref2) 

ggdraw() +
  draw_plot(a, x=0, y=0.5, width = 1, height = 0.5) + 
  draw_label("Raw", x=0.05, y=0.95) +
  draw_plot(b, x=0, y=0, width = 1, height = 0.5) +
  draw_label("Clean", x=0.05, y=0.55)

Compare before and after trimming beta diversity patterns

tmp <- subset_metabarlist(mblist, table = "pcr", indices = mblist$pcrs$nb_reads>0 & mblist$pcrs$type=="sample")

library(ecodist)
mds <- cmdscale(bcdist(tmp$reads), k = 2, eig = T, add = T) # bcdist from ecodist much faster than vegan's vegdist

d <- data.frame(mds$points)
d$controls <- tmp$pcrs[rownames(d), "control_type"]    
d$year <- tmp$samples[rownames(d), "year"]  

a<- ggplot(d, aes(x=X1, y=X2, color=year)) +
      geom_point() + theme_bw() +
      labs(x=paste("PCoA1 (", round(100*mds$eig[1]/sum(mds$eig),2), "%)", sep=""),
           y=paste("PCoA2 (", round(100*mds$eig[2]/sum(mds$eig),2), "%)", sep=""),
           shape="control type")

mds2 <- cmdscale(bcdist(mblist_clean$reads), k = 2, eig = T, add = T) # bcdist from ecodist much faster than vegan's vegdist

d2 <- data.frame(mds2$points)
d2$controls <- mblist_clean$pcrs[rownames(d2), "control_type"]    
d2$year <- mblist_clean$samples[rownames(d2), "year"]  

b <- ggplot(d2, aes(x=X1, y=X2, color=year)) +
      geom_point() + theme_bw() +
      labs(x=paste("PCoA1 (", round(100*mds2$eig[1]/sum(mds2$eig),2), "%)", sep=""),
           y=paste("PCoA2 (", round(100*mds2$eig[2]/sum(mds2$eig),2), "%)", sep=""),
           shape="control type")

ggdraw() +
  draw_plot(a, x=0, y=0.5, width = 1, height = 0.5) + 
  draw_label("Raw", x=0.05, y=0.95) +
  draw_plot(b, x=0, y=0, width = 1, height = 0.5) +
  draw_label("Clean", x=0.05, y=0.55)

Export

saveRDS(mblist_clean, "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez_2015_2018_cleandata.rds")